{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T08:46:15.870525Z",
     "iopub.status.busy": "2021-09-23T08:46:15.869957Z",
     "iopub.status.idle": "2021-09-23T08:46:17.144580Z",
     "shell.execute_reply": "2021-09-23T08:46:17.143481Z",
     "shell.execute_reply.started": "2021-09-23T08:46:15.870474Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import lightgbm as lgb\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:45:47.059292Z",
     "iopub.status.busy": "2021-09-23T09:45:47.058720Z",
     "iopub.status.idle": "2021-09-23T09:45:49.058405Z",
     "shell.execute_reply": "2021-09-23T09:45:49.057914Z",
     "shell.execute_reply.started": "2021-09-23T09:45:47.059242Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Read the four input CSV files in one pass; the variable names on the left\n",
    "# line up positionally with the paths in the list.\n",
    "train_data, train_internet, submit_example, test_public = map(\n",
    "    pd.read_csv,\n",
    "    [\n",
    "        './input/train_public.csv',\n",
    "        './input/train_internet.csv',\n",
    "        './input/submit_example.csv',\n",
    "        './input/test_public.csv',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:45:49.059490Z",
     "iopub.status.busy": "2021-09-23T09:45:49.059230Z",
     "iopub.status.idle": "2021-09-23T09:45:49.064369Z",
     "shell.execute_reply": "2021-09-23T09:45:49.063841Z",
     "shell.execute_reply.started": "2021-09-23T09:45:49.059464Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "loan_id                       int64\n",
       "user_id                       int64\n",
       "total_loan                  float64\n",
       "year_of_loan                  int64\n",
       "interest                    float64\n",
       "monthly_payment             float64\n",
       "class                        object\n",
       "employer_type                object\n",
       "industry                     object\n",
       "work_year                    object\n",
       "house_exist                   int64\n",
       "censor_status                 int64\n",
       "issue_date                   object\n",
       "use                           int64\n",
       "post_code                     int64\n",
       "region                        int64\n",
       "debt_loan_ratio             float64\n",
       "del_in_18month                int64\n",
       "scoring_low                 float64\n",
       "scoring_high                float64\n",
       "known_outstanding_loan        int64\n",
       "known_dero                    int64\n",
       "pub_dero_bankrup            float64\n",
       "recircle_b                  float64\n",
       "recircle_u                  float64\n",
       "initial_list_status           int64\n",
       "app_type                      int64\n",
       "earlies_credit_mon           object\n",
       "title                         int64\n",
       "policy_code                   int64\n",
       "f0                          float64\n",
       "f1                          float64\n",
       "f2                          float64\n",
       "f3                          float64\n",
       "f4                          float64\n",
       "early_return                  int64\n",
       "early_return_amount           int64\n",
       "early_return_amount_3mon    float64\n",
       "isDefault                     int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the dtype of every column in the training frame.\n",
    "train_data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:45:49.348936Z",
     "iopub.status.busy": "2021-09-23T09:45:49.348409Z",
     "iopub.status.idle": "2021-09-23T09:45:49.361695Z",
     "shell.execute_reply": "2021-09-23T09:45:49.360910Z",
     "shell.execute_reply.started": "2021-09-23T09:45:49.348888Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Ordinal encoding for the textual 'work_year' labels: a label's position in\n",
    "# the list (counting from 1) is the number it is replaced with.\n",
    "work_year_labels = [\n",
    "    '1 year', '2 years', '3 years', '4 years', '5 years',\n",
    "    '6 years', '7 years', '8 years', '9 years', '10+ years',\n",
    "]\n",
    "work_year_dict = {label: years for years, label in enumerate(work_year_labels, start=1)}\n",
    "\n",
    "# Series.map yields NaN for any value that is not a key of the mapping.\n",
    "for frame in (train_data, test_public):\n",
    "    frame['work_year'] = frame['work_year'].map(work_year_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:45:49.839241Z",
     "iopub.status.busy": "2021-09-23T09:45:49.838717Z",
     "iopub.status.idle": "2021-09-23T09:45:49.848979Z",
     "shell.execute_reply": "2021-09-23T09:45:49.848096Z",
     "shell.execute_reply.started": "2021-09-23T09:45:49.839193Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# The letters 'A'..'G' found in the 'class' column are replaced by 1..7.\n",
    "class_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n",
    "class_dict = dict(zip(class_letters, range(1, len(class_letters) + 1)))\n",
    "\n",
    "# Apply the same mapping to the training and the test frame.\n",
    "for frame in (train_data, test_public):\n",
    "    frame['class'] = frame['class'].map(class_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:45:50.179123Z",
     "iopub.status.busy": "2021-09-23T09:45:50.178595Z",
     "iopub.status.idle": "2021-09-23T09:45:50.195344Z",
     "shell.execute_reply": "2021-09-23T09:45:50.194702Z",
     "shell.execute_reply.started": "2021-09-23T09:45:50.179075Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Parse 'issue_date' and derive month / day-of-week features for both frames.\n",
    "# Each frame's features must come from its OWN 'issue_date' column: assigning\n",
    "# a Series taken from train_data to test_public aligns on the row index and\n",
    "# would give every test row the calendar values of an unrelated training row.\n",
    "for frame in (train_data, test_public):\n",
    "    frame['issue_date'] = pd.to_datetime(frame['issue_date'])\n",
    "    frame['issue_date_month'] = frame['issue_date'].dt.month\n",
    "    frame['issue_date_dayofweek'] = frame['issue_date'].dt.dayofweek"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:45:50.620548Z",
     "iopub.status.busy": "2021-09-23T09:45:50.620024Z",
     "iopub.status.idle": "2021-09-23T09:45:50.631333Z",
     "shell.execute_reply": "2021-09-23T09:45:50.630522Z",
     "shell.execute_reply.started": "2021-09-23T09:45:50.620500Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Remove the columns named in col_to_drop from the training and test frames.\n",
    "col_to_drop = ['issue_date', 'earlies_credit_mon']\n",
    "train_data, test_public = (\n",
    "    frame.drop(columns=col_to_drop) for frame in (train_data, test_public)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:45:51.038973Z",
     "iopub.status.busy": "2021-09-23T09:45:51.038429Z",
     "iopub.status.idle": "2021-09-23T09:45:51.052285Z",
     "shell.execute_reply": "2021-09-23T09:45:51.051465Z",
     "shell.execute_reply.started": "2021-09-23T09:45:51.038923Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "cat_cols = ['employer_type', 'industry']\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Integer-encode the categorical columns. The encoder is fitted on the values\n",
    "# of the training and test frames together, so a category that occurs only in\n",
    "# the test file does not make transform() fail on an unseen label. When the\n",
    "# test frame has no extra categories the codes equal those of a train-only fit.\n",
    "for col in cat_cols:\n",
    "    lbl = LabelEncoder().fit(pd.concat([train_data[col], test_public[col]]))\n",
    "    train_data[col] = lbl.transform(train_data[col])\n",
    "    test_public[col] = lbl.transform(test_public[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:53:54.033463Z",
     "iopub.status.busy": "2021-09-23T09:53:54.032924Z",
     "iopub.status.idle": "2021-09-23T09:53:54.045430Z",
     "shell.execute_reply": "2021-09-23T09:53:54.044759Z",
     "shell.execute_reply.started": "2021-09-23T09:53:54.033413Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "def k_fold_serachParmaters(model, train_data, train_label, test_data,\n",
    "                           n_splits=5, random_state=2020):\n",
    "    \"\"\"Cross-validate ``model`` and average its predictions for ``test_data``.\n",
    "\n",
    "    For each stratified fold the model is refitted on the training part with\n",
    "    early stopping monitored on the validation part. The refitted model then\n",
    "    scores the validation rows and predicts ``test_data``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model : estimator\n",
    "        Must provide ``predict_proba`` and a ``fit`` that accepts ``eval_set``,\n",
    "        ``categorical_feature``, ``early_stopping_rounds`` and ``verbose``.\n",
    "    train_data : pandas.DataFrame\n",
    "        Training features; rows are selected positionally with ``.iloc``.\n",
    "    train_label : pandas.Series\n",
    "        Binary target, row-aligned with ``train_data``.\n",
    "    test_data : pandas.DataFrame\n",
    "        Features to predict; must be accepted by ``model.predict_proba``.\n",
    "    n_splits : int, default 5\n",
    "        Number of stratified folds.\n",
    "    random_state : int, default 2020\n",
    "        Seed for the shuffled fold assignment.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(auc_val, pred_Test)``: the validation ROC AUC averaged over the\n",
    "        folds, and the fold-averaged positive-class probability for every row\n",
    "        of ``test_data``.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    The categorical feature names are read from the notebook-level ``cat_cols``.\n",
    "    \"\"\"\n",
    "    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)\n",
    "    pred_Test = np.zeros(len(test_data))\n",
    "    auc_val = 0.0\n",
    "\n",
    "    for tr_idx, val_idx in sk.split(train_data, train_label):\n",
    "        x_train, y_train = train_data.iloc[tr_idx], train_label.iloc[tr_idx]\n",
    "        x_val, y_val = train_data.iloc[val_idx], train_label.iloc[val_idx]\n",
    "\n",
    "        # NOTE(review): newer LightGBM releases appear to have moved\n",
    "        # early_stopping_rounds / verbose from fit() to callbacks - confirm\n",
    "        # against the installed version before upgrading.\n",
    "        model.fit(x_train, y_train,\n",
    "                  eval_set=[(x_val, y_val)],\n",
    "                  categorical_feature=cat_cols,\n",
    "                  early_stopping_rounds=100,\n",
    "                  verbose=False)\n",
    "\n",
    "        pred_Test += model.predict_proba(test_data)[:, 1] / n_splits\n",
    "\n",
    "        # ROC AUC is a ranking metric, so it is computed from the predicted\n",
    "        # positive-class probabilities; hard 0/1 labels from predict() collapse\n",
    "        # the ranking to a single threshold and understate the score.\n",
    "        val_proba = model.predict_proba(x_val)[:, 1]\n",
    "        auc_val += roc_auc_score(y_val, val_proba) / n_splits\n",
    "\n",
    "    return auc_val, pred_Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T09:58:29.407697Z",
     "iopub.status.busy": "2021-09-23T09:58:29.407115Z",
     "iopub.status.idle": "2021-09-23T09:59:11.342754Z",
     "shell.execute_reply": "2021-09-23T09:59:11.342326Z",
     "shell.execute_reply.started": "2021-09-23T09:58:29.407646Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.6379297058384644\n",
      "0.6378193432101231\n",
      "0.6411947923481109\n",
      "0.6379297058384644\n",
      "0.639650980531848\n",
      "0.6379297058384644\n",
      "0.6397703258341599\n",
      "0.6388898032496009\n",
      "0.6388898032496009\n",
      "0.6388898032496009\n",
      "0.6411947923481109\n",
      "0.6398875480086581\n",
      "0.6399286984891315\n",
      "0.6411947923481109\n",
      "0.6388898032496009\n",
      "0.639650980531848\n",
      "0.6379297058384644\n",
      "0.6379297058384644\n",
      "0.6399286984891315\n",
      "0.6397703258341599\n"
     ]
    }
   ],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# Seeded generator for the hyper-parameter draws, so that re-running the cell\n",
    "# trains the same set of models.\n",
    "param_rng = np.random.RandomState(2020)\n",
    "\n",
    "# The feature frames are identical for every run, so they are built once.\n",
    "# errors='ignore' keeps the test frame usable on a re-run, when the cell that\n",
    "# stores predictions in test_public['isDefault'] has already been executed.\n",
    "id_cols = ['loan_id', 'user_id']\n",
    "train_features = train_data.drop(columns=id_cols + ['isDefault'])\n",
    "train_target = train_data['isDefault']\n",
    "test_features = test_public.drop(columns=id_cols + ['isDefault'], errors='ignore')\n",
    "\n",
    "tta_fold = 20                              # number of models averaged\n",
    "score_list = []                            # cross-validated AUC of each model\n",
    "score_tta = np.zeros(len(test_features))   # running mean of the test predictions\n",
    "\n",
    "for _ in range(tta_fold):\n",
    "    clf = lgb.LGBMClassifier(\n",
    "        num_leaves=int(param_rng.randint(6, 10)),        # 6..9\n",
    "        min_child_samples=int(param_rng.randint(2, 5)),  # 2..4\n",
    "        max_depth=7, learning_rate=0.01,\n",
    "        n_estimators=2050, n_jobs=-1)\n",
    "\n",
    "    score, test_pred = k_fold_serachParmaters(\n",
    "        clf, train_features, train_target, test_features)\n",
    "\n",
    "    print(score)\n",
    "    score_list.append(score)\n",
    "    score_tta += test_pred / tta_fold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T10:00:31.618833Z",
     "iopub.status.busy": "2021-09-23T10:00:31.618257Z",
     "iopub.status.idle": "2021-09-23T10:00:31.625106Z",
     "shell.execute_reply": "2021-09-23T10:00:31.624021Z",
     "shell.execute_reply.started": "2021-09-23T10:00:31.618782Z"
    }
   },
   "outputs": [],
   "source": [
    "# Attach the averaged test predictions to the test frame as 'isDefault'.\n",
    "test_public['isDefault'] = score_tta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-23T10:01:41.359180Z",
     "iopub.status.busy": "2021-09-23T10:01:41.358611Z",
     "iopub.status.idle": "2021-09-23T10:01:41.379453Z",
     "shell.execute_reply": "2021-09-23T10:01:41.378784Z",
     "shell.execute_reply.started": "2021-09-23T10:01:41.359131Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Export the predictions: 'loan_id' is renamed to 'id' and only the 'id' and\n",
    "# 'isDefault' columns are written to the CSV file.\n",
    "submission = test_public.rename(columns={'loan_id': 'id'})\n",
    "submission[['id', 'isDefault']].to_csv('aaa.csv', index=None)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
